/*
Content: Basic data preparation of IEB data - standard routines
Paper: "Immigrants Return Intentions and Labor Market Behavior when the Home Country is Unsafe"
Journal: Journal of Labor Economics
Authors: Jacopo Bassetto, Teresa Freitas-Monteiro
*/


/*
Note: our whole data preparation was divided into three parts, which are eventually merged. 
This splitting was necessary because of the large data size.
*/
	
********************************************************************************
**# LOAD RAW IEB DATA WITH ONLY BASIC VARIABLES
********************************************************************************

* Load only the three variables needed to derive the nationality variables and
* to draw the ID sample (person ID, nationality code, original spell start date).
use prs_id ieb_staat_num ieb_beg_orig using "$orig/originaldata.dta", clear

			
********************************************************************************		
* GENERATE BASIC VARIABLES OF NATIONALITY
********************************************************************************
rename prs_id persnr

* DIFFERENT TYPES OF NATIONALITY VARIABLES 
*
* Both versions below use the same technique:
*   - sum(!missing(x)) is a running count of valid values within person, ordered
*     by spell start. It equals 1 on the first valid row and on any following
*     rows with missing x (until the second valid row).
*   - egen total() ignores the missing products, so the person-level total equals
*     the value of x in the first valid row. It is 0 when the person has no valid
*     row at all.
* NOTE(review): ties in ieb_beg_orig within a person are ordered arbitrarily by
* the sort, so "first" is not uniquely defined if such spells carry different
* codes - confirm whether this can occur in the data.

	* 1. FIRST NATIONALITY THAT COMES IN THE ADMIN DATA 

		gen nationality = ieb_staat_num 

		* Invalid codes are set to missing (7 = unknown, the others are non-valid codes).
		* Code 0 is kept as valid here, so a person can have nat_first == 0.
		replace nationality = . if inlist(nationality, 999, 998, 997, 7, -9, -7, -5)
		
		* give first valid value (not missing) to all observations
		bys persnr (ieb_beg_orig): gen whenfirst_nat = sum(!missing(nationality)) == 1
		
		bys persnr (ieb_beg_orig): egen nat_first = total(whenfirst_nat * nationality) 
 
		* drop intermediate variables
		cap drop whenfirst_nat nationality
	

	* 2. NATIONALITY IN THE FIRST VALID SPELL WITHOUT GERMANS (German spells are considered as invalid)
			
		gen nationality = ieb_staat_num 

		* Same invalid codes as above, plus code 0, which is additionally treated
		* as invalid in this version (per the heading: German spells).
		replace nationality = . if inlist(nationality, 999, 998, 997, 7, -9, -7, -5, 0)
		
		* give first valid value (not missing) to all observations
		bys persnr (ieb_beg_orig): gen whenfirst_nat = sum(!missing(nationality)) == 1
		
		bys persnr (ieb_beg_orig): egen nat_first_noger = total(whenfirst_nat * nationality) 
		
		* Drop intermediate variables
		cap drop whenfirst_nat nationality 

	
	* SAMPLE 40% OF THE ORIGINAL 70% 
	
	keep persnr nat_first nat_first_noger

	* Drop persons whose first valid nationality is 0, and persons without any
	* valid nationality (for whom total() also returned 0).
	keep if !inlist(nat_first,.,0)
	
	* Reduce to one row per person: all remaining variables are constant within person
	bys persnr: gen firstob = 1 if _n == 1
	keep if firstob == 1
	
	* NOTE(review): the draw depends on the state of the random-number generator.
	* No seed is set in this file - confirm that 'set seed' is issued upstream
	* (e.g. in the master file), otherwise the sample is not reproducible.
	sample 40
	
save "$intermediate/40perc_ids_noger", replace


********************************************************************************
**# ATTACH THE 40% OF IDs TO THEIR FULL EMPLOYMENT HISTORY SPELLS
********************************************************************************


* Reload the complete original data (all variables, all spells)
use "$orig/originaldata", clear

* Create the merge key under the name used in the sampled ID file;
* prs_id is kept alongside it.
clonevar persnr = prs_id

* Keep only spells of sampled persons (keep(3) = matched in both files) and
* attach the two person-level nationality variables.
merge m:1 persnr using "$intermediate/40perc_ids_noger", gen(match_40per) keep(3) keepusing(persnr nat_first nat_first_noger)

save "$intermediate/40perc_full_noger", replace


********************************************************************************
**# RUN ROUTINES FOR THE PREPARATION OF IEB DATA (info from parallel spells, non-overlapping spells, education imputation, etc.)
********************************************************************************


* Load the spell data of the sampled persons from the folder it was saved to
* above ($intermediate).
use "$intermediate/40perc_full_noger", clear


* DIFFERENT TYPES OF EDUCATION VARIABLES 
	
	* EDUCATION IN THE FIRST VALID SPELL
		cap drop education whenfirst_edu
		
		* education  : collapsed into broad categories below
		* education2 : keeps the raw ieb_dba_id codes (only invalid codes are blanked)
		gen education = ieb_dba_id 
		gen education2 = ieb_dba_id
		
		* Collapse the detailed codes. 11 and 12 are temporary codes that are
		* merged into category 3 further down. The order of these statements matters.
		replace education = 11 if inlist(education, 5,11, 14, 15, 17, 19,20)		// FH (university of applied sciences) --> FH, included non-recognized
		replace education = 12 if inlist(education, 6,12,13,16,18,21,10,23,24,25,8)			// Uni --> Uni, included non-recognized
		replace education = 2  if inlist(education, 2,3,4,7,9)	// vocational school / technical school --> in-firm / external vocational training, included non-recognized
		replace education = 1  if inlist(education, 1,22)	
				
				* FH and Uni are pooled into one category
				replace education = 3 if inlist(education, 11,12)
		
		* set to missing the values I want to consider as missing
		replace education = . if inlist(education, 9997, 9998)
		
		* give first valid value (not missing) to all observations
		* (running count of valid values == 1 flags the first valid row; total()
		*  ignores missing products and returns 0 if the person has no valid row)
		bys persnr (ieb_beg_orig): gen whenfirst_edu = sum(!missing(education)) == 1
		
		bys persnr (ieb_beg_orig): egen edu_first = total(whenfirst_edu * education) 
		
	
	* ALTERNATIVE VERSION OF EDUCATION IN THE FIRST SPELL
		* The condition must test education2 itself: 'education' has already been
		* set to missing for 9997/9998 above, so testing it would never match and
		* the invalid codes would stay in education2 (and could end up in edu_first2).
		replace education2 = . if inlist(education2, 9997, 9998)
		
		* give first valid value (not missing) to all observations
		bys persnr (ieb_beg_orig): gen whenfirst_edu2 = sum(!missing(education2)) == 1
		
		bys persnr (ieb_beg_orig): egen edu_first2 = total(whenfirst_edu2 * education2) 
	

	* MAX EDUCATION	
		bys persnr: egen edu_max = max(education)
		* Codes 26 and 27 are not collapsed above; if one of them is the person's
		* maximum it is assigned to category 3 here.
		replace edu_max = 3 if inlist(edu_max,26,27)
		* Persons without any valid education value get 0
		replace edu_max = 0 if edu_max == .

********************************************************************************

* DATE OF BIRTH VARIABLES 

	* Calendar components of the date of birth (geb_dat)
	generate gebjahr  = year(geb_dat)
	generate gebmonth = month(geb_dat)

	* Monthly date of birth built from the two components, shown in %tm format
	generate gebmy = ym(gebjahr, gebmonth)
	format %tm gebmy
	
* FIRST AND LAST SPELL

	* End date of the row sorted last by ieb_end_orig within person
	bysort persnr (ieb_end_orig): generate exit_spell = ieb_end_orig[_N]

	* Start date of the row sorted first by ieb_beg_orig within person
	bysort persnr (ieb_beg_orig): generate entry_spell = ieb_beg_orig[1]

	


********************************************************************************
* TRANSFORM VARIABLES SO THAT THEY EQUAL THE SIAB
********************************************************************************
*rename prs_id persnr 
* betnr already carries its target name
rename betnr betnr 

* Data-source variable: renamed in two steps, ending up as 'quelle'
rename ieb_quellverf_id quelle_gr 
rename quelle_gr quelle

* Remaining IEB variables, renamed in one go (old names first, new names second)
rename (ieb_beg_orig ieb_end_orig ieb_beg_epi ieb_end_epi ieb_tag_entg ///
        ieb_erw_stat ieb_abg sex_id ieb_dba_id ieb_spell)              ///
       (begorig      endorig      begepi      endepi      tentgelt     ///
        erwstat      grund  frau   ausbildung spell)


* Part-time indicator: codes 8, 9 and 21 of berufstellg_imp are coded 1 (labelled
* "teilzeit"), every other value 0. Rows excluded by the if-condition are left missing.
recode berufstellg_imp (8 9 21 = 1 "teilzeit") (else = 0) if berufstellg_imp >= 0, generate(teilzeit)

* Calendar year of the original spell start
	generate jahr = year(begorig)
	
* Age in whole years at the original spell start (days since birth divided by 365, truncated)
	generate age = int((begorig - geb_dat)/365)
	

* Age in whole years at the start of the person's first spell (same approximation)
	generate age_entry = int((entry_spell - geb_dat)/365)
		
		

********************************************************************************		
* RUN ALL SIAB PREPARATION ROUTINES 
********************************************************************************

* path to modified version of '01_SIAB_bio.do' from FDZ-Methodenreport 06/2017 (Johanne Eberle & Alexandra Schmucker)
* In order to make '01_SIAB_bio.do' compatible with our master-file, the first
* and last lines of '01_SIAB_bio.do' have been commented out in this version.
global fileEberleSchmucker "$prep\Eberle_Schmucker2017\01_SIAB_bio_MODIFIED_final.do"

* Location of the BHP files
global fileBHP "\\IAB\dfs\017\Ablagen\D01700-IAB-Projekte\D01700-Projekte-FDZ\Datensaetze\_Endprodukte\BHP\BHP7519_v2\Grundgesamtheit"

* Observation period: first year (default: 1975) and last year (default: 2014)
global minYear 2000
global maxYear 2020



********************************************************************************
* 01) Split episodes that span over one year
********************************************************************************
do "$prep\01_split_episodes.do"


********************************************************************************
* 02) Biographical variables following Eberle & Schmucker
********************************************************************************

* Close any log that is still open, then log this step separately
capture log close
log using "$log\02_EberleSchmucker.log", replace

do "$fileEberleSchmucker"

capture log close


********************************************************************************
* 03) Education imputation based on Fitzenberger, Osikominu & Voelter (2008)
********************************************************************************

do "$prep\05_educ_imputation.do"


********************************************************************************
* 04) Add the contribution assessment ceiling
********************************************************************************
	
	* Create variable for the Land 
	
			* Remove leftovers from a previous run. The variable created below is
			* ao_bula, so it must be dropped as well - otherwise 'gen' fails when
			* this section is re-run on data that already contain it.
			cap drop ao_land 
			cap drop ao_bula
			* Integer part of the district code divided by 1000
			* NOTE(review): presumably the leading digits of ieb_ao_krs_num identify
			* the federal state - confirm against the IEB documentation.
			gen ao_bula = int(ieb_ao_krs_num/1000)
			tab ao_bula
			* A result of 0 is treated as missing
			replace ao_bula = . if ao_bula == 0

do "$prep\06_wages_assessment_ceiling.do"


********************************************************************************
* 05) Add the marginal part-time income threshold and flag marginal wages
********************************************************************************
do "$prep\07_wages_marginal.do"


********************************************************************************
* 06) Deflate wages, marginal part-time income threshold and contribution assessment ceiling
********************************************************************************
do "$prep\08_wages_deflation.do"

********************************************************************************
* 10) Treat parallel episodes:
* Generate information on parallel episodes
* Keep only 'main' episode
********************************************************************************
	
do "$prep\10_parallel_episodes.do"


* Save intermediate dataset
save "$intermediate/40perc_full_noger_prepared.dta", replace
	
* Empty the memory after saving
clear
	

	


	
